Libraries required for this analysis

# Center all knitr figures by default.
knitr::opts_chunk$set(fig.align="center") 
# Modeling and posterior-analysis libraries. Attach order is deliberate:
# later packages (e.g. magrittr, brms) may mask earlier ones.
library(rstanarm)
library(tidyverse)
library(tidybayes)
library(modelr) 
library(ggplot2)
library(magrittr)  
library(emmeans)
library(bayesplot)
library(brms)
library(gganimate)

# Light ggplot2 theme for every plot in this document.
theme_set(theme_light())


# Project-local helpers used below, e.g. user_response_posterior_draws_plot()
# and user_response_diff_plot(). NOTE(review): their exact contracts are not
# visible here — see helper_functions.R.
source('helper_functions.R')

In our experiment, we used a visualization recommendation algorithm (composed of one search algorithm and one oracle algorithm) to generate visualizations for the user on one of two datasets. We then asked the user to evaluate the tool on a variety of metrics (confidence in understanding data, confidence in answer, efficiency, ease of use, utility, and overall).

Given a search algorithm (bfs or dfs), an oracle (CompassQL or dziban), and a dataset (birdstrikes or movies), we would like to predict a user’s score for a given metric. In addition, we would like to know if the choice of search algorithm and oracle, as well as participant group (student or professional) has any meaningful impact on a user’s rating for these metrics.

Our weakly-informative prior (normal(0.26, 1.26)) was derived from pilot studies, and it summarizes the user rating for each metric. Because our pilot study was small, we chose to aggregate our data (rather than deriving separate priors for each metric) to minimize the effect of biases.

Since ratings can have values between -2 and 2 inclusive, we perform ordinal regression.

Read in and clean data

# Metric columns analyzed in this document, split into confidence-type and
# preference-type metrics for later convenience.
analyses <- c("confidence.udata", "confidence.ans", "efficiency", "ease.of.use", "utility", "overall")
confidence_metrics <- c("confidence.udata", "confidence.ans")
preference_metrics <- c("efficiency", "ease.of.use", "utility", "overall")

user_response_data <- read.csv('data/ptask_responses.csv')

# Normalize condition labels for display. These are literal substitutions,
# so fixed = TRUE avoids any regex interpretation.
user_response_data$oracle <- gsub('compassql', 'CompassQL', user_response_data$oracle, fixed = TRUE)
user_response_data$oracle <- gsub('dziban', 'Dziban', user_response_data$oracle, fixed = TRUE)

user_response_data$search <- gsub('bfs', 'BFS', user_response_data$search, fixed = TRUE)
user_response_data$search <- gsub('dfs', 'DFS', user_response_data$search, fixed = TRUE)

# Ratings are Likert-style, so each metric column becomes an ordered factor
# (required for the cumulative ordinal models fit below).
user_response_data[, analyses] <- lapply(user_response_data[, analyses], ordered)
user_response_data <- user_response_data %>%
  mutate(
    dataset = as.factor(dataset),
    oracle = as.factor(oracle),
    search = as.factor(search),
    task = as.factor(task)
  )

# Containers for the fitted models and the per-metric difference plots.
models <- list()

search_differences <- list()
oracle_differences <- list()
alg_differences <- list()
participant_group_differences <- list()

# Shared RNG seed for model fitting and posterior prediction.
seed <- 12

Analysis for user responses

Confidence in Understanding Data: Building a Model

# Ordinal (cumulative probit) regression for the confidence-in-understanding-data
# rating. Oracle and search are allowed to interact; dataset, task, and
# participant group enter additively, with a per-participant random intercept
# for the repeated-measures design.
models$confidence_udata <- brm(
    formula = bf(confidence.udata ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly-informative prior on the latent cutpoints, derived from pilot data.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    # High adapt_delta to suppress divergent transitions.
    control = list(adapt_delta = 0.99),
    # Fit is cached on disk; delete this file to refit.
    file = "models/confidence_udata",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$confidence_udata)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.udata ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.12      0.16     0.83     1.45 1.00      888     1378
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -2.14      0.43    -2.98    -1.34 1.00     1223
## Intercept[2]                -0.95      0.41    -1.75    -0.17 1.00     1142
## Intercept[3]                 1.25      0.41     0.46     2.05 1.00     1192
## oracleDziban                -0.00      0.44    -0.87     0.88 1.00      855
## searchDFS                   -0.65      0.42    -1.49     0.15 1.00      930
## datasetmovies                0.16      0.30    -0.41     0.76 1.00     1230
## task2.RetrieveValue          0.27      0.20    -0.14     0.66 1.00     2184
## task3.Prediction             0.12      0.20    -0.27     0.52 1.00     2585
## task4.Exploration            0.57      0.21     0.16     0.96 1.00     2068
## participant_groupstudent     0.19      0.30    -0.38     0.76 1.00     1091
## oracleDziban:searchDFS       0.82      0.61    -0.36     1.96 1.00      845
##                          Tail_ESS
## Intercept[1]                 1613
## Intercept[2]                 1763
## Intercept[3]                 1575
## oracleDziban                 1302
## searchDFS                    1488
## datasetmovies                1604
## task2.RetrieveValue          2192
## task3.Prediction             2310
## task4.Exploration            2031
## participant_groupstudent     1747
## oracleDziban:searchDFS       1524
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$confidence_udata)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the population-level effects. The oracle/search levels were
# recoded to "Dziban"/"DFS" above, so the parameter names must be capitalized
# to match summary() output (b_oracleDziban, b_searchDFS); with fixed = TRUE
# (exact matching) the previous lowercase names matched no parameters.
pairs(
  models$confidence_udata,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for confidence in understanding the data using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

confidence_udata_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_udata, NULL, "Oracle/Search Combination", "Rating")
confidence_udata_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

confidence_udata_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  1.11   0.792  1.42    0.95 mean   qi       
## 2 BFS    Dziban     1.11   0.792  1.42    0.95 mean   qi       
## 3 DFS    CompassQL  0.794  0.444  1.10    0.95 mean   qi       
## 4 DFS    Dziban     1.19   0.889  1.49    0.95 mean   qi       
## 5 BFS    CompassQL  1.11   1      1.21    0.5  mean   qi       
## 6 BFS    Dziban     1.11   1      1.22    0.5  mean   qi       
## 7 DFS    CompassQL  0.794  0.694  0.903   0.5  mean   qi       
## 8 DFS    Dziban     1.19   1.08   1.29    0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Understanding Data: Differences Between Conditions

Next, we want to see if there is any meaningful difference in the confidence-in-understanding-data rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior-predictive draws (population level only: re_formula = NA drops the
# participant random effect), plus a combined search/oracle condition label.
confidence_udata_predictive_data <- user_response_data %>%
  add_predicted_draws(models$confidence_udata, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

search_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "search", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_udata$plot

Differences in user score by oracle.

oracle_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "oracle", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_udata$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

confidence_udata_predictive_data_subset <- subset(confidence_udata_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))

alg_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data_subset, "alg", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_udata$plot

Differences in user score by participant group

participant_group_differences$confidence_udata <- user_response_diff_plot(confidence_udata_predictive_data, "participant_group", "confidence.udata", "Difference in Confidence in Understanding Data Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_udata$plot

Confidence in Answer: Building a Model

# Ordinal (cumulative probit) regression for the confidence-in-answer rating;
# same model structure and prior as the other per-metric models.
models$confidence_ans <- brm(
    formula = bf(confidence.ans ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly-informative pilot-derived prior on the latent cutpoints.
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    # Fit is cached on disk; delete this file to refit.
    file = "models/confidence_ans",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$confidence_ans)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: confidence.ans ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.56      0.13     0.31     0.82 1.00      826     1264
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -3.28      0.41    -4.09    -2.49 1.00     2880
## Intercept[2]                -2.46      0.31    -3.08    -1.86 1.00     2848
## Intercept[3]                -1.61      0.29    -2.19    -1.04 1.00     3037
## Intercept[4]                 0.16      0.27    -0.37     0.70 1.00     2751
## oracleDziban                 0.23      0.27    -0.30     0.76 1.00     2231
## searchDFS                    0.06      0.27    -0.46     0.58 1.00     2490
## datasetmovies               -0.16      0.19    -0.53     0.22 1.00     2179
## task2.RetrieveValue         -0.29      0.20    -0.69     0.11 1.00     3629
## task3.Prediction            -1.08      0.21    -1.48    -0.68 1.00     3447
## task4.Exploration           -0.69      0.20    -1.08    -0.30 1.00     3787
## participant_groupstudent     0.13      0.20    -0.27     0.53 1.00     2701
## oracleDziban:searchDFS      -0.04      0.38    -0.78     0.72 1.00     2197
##                          Tail_ESS
## Intercept[1]                 1879
## Intercept[2]                 2088
## Intercept[3]                 2300
## Intercept[4]                 2254
## oracleDziban                 2340
## searchDFS                    2382
## datasetmovies                2254
## task2.RetrieveValue          2275
## task3.Prediction             2568
## task4.Exploration            2632
## participant_groupstudent     2006
## oracleDziban:searchDFS       1791
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$confidence_ans)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the population-level effects. Parameter names must match the
# recoded factor levels reported by summary() (b_oracleDziban, b_searchDFS);
# the previous lowercase names matched nothing under fixed = TRUE.
pairs(
  models$confidence_ans,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for confidence in answer using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

confidence_ans_plot <- user_response_posterior_draws_plot(user_response_data, models$confidence_ans, NULL, "Oracle/Search Combination", "Rating")
confidence_ans_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

confidence_ans_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL   1.06  0.764   1.32   0.95 mean   qi       
## 2 BFS    Dziban      1.20  0.903   1.46   0.95 mean   qi       
## 3 DFS    CompassQL   1.09  0.778   1.38   0.95 mean   qi       
## 4 DFS    Dziban      1.21  0.931   1.44   0.95 mean   qi       
## 5 BFS    CompassQL   1.06  0.958   1.15   0.5  mean   qi       
## 6 BFS    Dziban      1.20  1.11    1.29   0.5  mean   qi       
## 7 DFS    CompassQL   1.09  1       1.19   0.5  mean   qi       
## 8 DFS    Dziban      1.21  1.12    1.29   0.5  mean   qi
## Saving 7 x 5 in image

Confidence in Answer: Differences Between Conditions

Next, we want to see if there is any meaningful difference in the confidence-in-answer rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

confidence_ans_predictive_data <- user_response_data %>% add_predicted_draws(models$confidence_ans, seed = seed, re_formula = NA) 
confidence_ans_predictive_data$alg <- paste(confidence_ans_predictive_data$search, confidence_ans_predictive_data$oracle)

Differences in user score by search algorithm.

search_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "search", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$confidence_ans$plot

Differences in user score by oracle.

oracle_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "oracle", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$confidence_ans$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

confidence_ans_predictive_data_subset <- subset(confidence_ans_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data_subset, "alg", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$confidence_ans$plot

Differences in user score by participant group

participant_group_differences$confidence_ans <- user_response_diff_plot(confidence_ans_predictive_data, "participant_group", "confidence.ans", "Difference in Confidence in Answer Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$confidence_ans$plot

Efficiency: Building a Model

# Ordinal (cumulative probit) regression for the efficiency rating; same
# structure and prior as the other per-metric models. (An unused `filename`
# variable was removed — the cache path is passed directly via `file`.)
models$efficiency <- brm(
    formula = bf(efficiency ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
    prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    # Fit is cached on disk; delete this file to refit.
    file = "models/efficiency",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$efficiency)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: efficiency ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.13      0.15     0.87     1.46 1.00     1024     1546
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -2.65      0.43    -3.50    -1.82 1.00     1207
## Intercept[2]                -1.05      0.40    -1.83    -0.26 1.00     1179
## Intercept[3]                -0.22      0.40    -1.01     0.59 1.00     1174
## Intercept[4]                 1.04      0.40     0.25     1.86 1.00     1179
## oracleDziban                -0.13      0.43    -0.98     0.72 1.00      856
## searchDFS                   -1.26      0.43    -2.09    -0.44 1.00      964
## datasetmovies                0.18      0.30    -0.40     0.79 1.00     1266
## task2.RetrieveValue         -0.27      0.18    -0.62     0.08 1.00     3519
## task3.Prediction             0.26      0.19    -0.10     0.63 1.00     4036
## task4.Exploration            0.38      0.19     0.01     0.76 1.00     3832
## participant_groupstudent     0.16      0.31    -0.47     0.79 1.00     1234
## oracleDziban:searchDFS       0.81      0.60    -0.38     1.98 1.00      918
##                          Tail_ESS
## Intercept[1]                 1370
## Intercept[2]                 1451
## Intercept[3]                 1476
## Intercept[4]                 1613
## oracleDziban                 1517
## searchDFS                    1315
## datasetmovies                1662
## task2.RetrieveValue          2602
## task3.Prediction             2581
## task4.Exploration            2477
## participant_groupstudent     1335
## oracleDziban:searchDFS       1601
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$efficiency)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the population-level effects. Parameter names must match the
# recoded factor levels reported by summary() (b_oracleDziban, b_searchDFS);
# the previous lowercase names matched nothing under fixed = TRUE.
pairs(
  models$efficiency,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for efficiency using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

efficiency_plot <- user_response_posterior_draws_plot(user_response_data, models$efficiency, NULL, "Oracle/Search Combination", "Rating")
efficiency_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

efficiency_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower  .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>   <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.797  0.264  1.29     0.95 mean   qi       
## 2 BFS    Dziban     0.693  0.166  1.18     0.95 mean   qi       
## 3 DFS    CompassQL -0.262 -0.764  0.278    0.95 mean   qi       
## 4 DFS    Dziban     0.318 -0.25   0.834    0.95 mean   qi       
## 5 BFS    CompassQL  0.797  0.625  0.972    0.5  mean   qi       
## 6 BFS    Dziban     0.693  0.528  0.875    0.5  mean   qi       
## 7 DFS    CompassQL -0.262 -0.444 -0.0833   0.5  mean   qi       
## 8 DFS    Dziban     0.318  0.125  0.514    0.5  mean   qi
## Saving 7 x 5 in image

Efficiency: Differences Between Conditions

Next, we want to see if there is any meaningful difference in the efficiency rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

efficiency_predictive_data <- user_response_data %>% add_predicted_draws(models$efficiency, seed = seed, re_formula = NA) 
efficiency_predictive_data$alg <- paste(efficiency_predictive_data$search, efficiency_predictive_data$oracle)

Differences in user score by search algorithm.

search_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "search", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$efficiency$plot

Differences in user score by oracle.

oracle_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "oracle", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$efficiency$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Keep only the two fully-crossed conditions for this pairwise comparison.
# Renamed to match the *_predictive_data_subset convention used in the other
# sections (the old name had a duplicated "_data_data_").
efficiency_predictive_data_subset <- subset(efficiency_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data_subset, "alg", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$efficiency$plot

Differences in user score by participant group

participant_group_differences$efficiency <- user_response_diff_plot(efficiency_predictive_data, "participant_group", "efficiency", "Difference in Efficiency Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$efficiency$plot

Ease of Use: Building a Model

# Ordinal (cumulative probit) regression for the ease-of-use rating; same
# model structure and prior as the other per-metric models.
models$ease_of_use <- brm(
    formula = bf(ease.of.use ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly-informative pilot-derived prior on the latent cutpoints.
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    # Fit is cached on disk; delete this file to refit.
    file = "models/ease_of_use",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$ease_of_use)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: ease.of.use ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.05      0.14     0.79     1.36 1.00     1050     1608
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -2.68      0.42    -3.55    -1.88 1.00     1334
## Intercept[2]                -1.25      0.37    -1.95    -0.53 1.00     1221
## Intercept[3]                -0.30      0.37    -1.01     0.42 1.00     1189
## Intercept[4]                 1.62      0.38     0.90     2.35 1.00     1276
## oracleDziban                -0.32      0.41    -1.13     0.48 1.00      962
## searchDFS                   -1.27      0.40    -2.07    -0.48 1.00      953
## datasetmovies                0.31      0.29    -0.24     0.88 1.00     1018
## task2.RetrieveValue          0.19      0.19    -0.18     0.56 1.00     3365
## task3.Prediction             0.27      0.18    -0.09     0.62 1.00     2785
## task4.Exploration            0.38      0.19     0.02     0.74 1.00     3326
## participant_groupstudent     0.47      0.29    -0.10     1.03 1.00     1071
## oracleDziban:searchDFS       0.79      0.56    -0.28     1.91 1.00      902
##                          Tail_ESS
## Intercept[1]                 1616
## Intercept[2]                 1643
## Intercept[3]                 1771
## Intercept[4]                 1565
## oracleDziban                 1405
## searchDFS                    1502
## datasetmovies                1490
## task2.RetrieveValue          2390
## task3.Prediction             2373
## task4.Exploration            2453
## participant_groupstudent     1417
## oracleDziban:searchDFS       1451
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$ease_of_use)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of the population-level effects. Parameter names must match the
# recoded factor levels reported by summary() (b_oracleDziban, b_searchDFS);
# the previous lowercase names matched nothing under fixed = TRUE.
pairs(
  models$ease_of_use,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for ease of use using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

ease_of_use_plot <- user_response_posterior_draws_plot(user_response_data, models$ease_of_use, NULL, "Oracle/Search Combination", "Rating")
ease_of_use_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

ease_of_use_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL 0.933   0.555   1.28    0.95 mean   qi       
## 2 BFS    Dziban    0.738   0.319   1.11    0.95 mean   qi       
## 3 DFS    CompassQL 0.0732 -0.403   0.528   0.95 mean   qi       
## 4 DFS    Dziban    0.418  -0.0417  0.806   0.95 mean   qi       
## 5 BFS    CompassQL 0.933   0.806   1.06    0.5  mean   qi       
## 6 BFS    Dziban    0.738   0.611   0.875   0.5  mean   qi       
## 7 DFS    CompassQL 0.0732 -0.0833  0.236   0.5  mean   qi       
## 8 DFS    Dziban    0.418   0.264   0.569   0.5  mean   qi
## Saving 7 x 5 in image

Ease of Use: Differences Between Conditions

Next, we want to see if there is any meaningful difference in the ease-of-use rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

ease_of_use_predictive_data <- user_response_data %>% add_predicted_draws(models$ease_of_use, seed = seed, re_formula = NA) 
ease_of_use_predictive_data$alg <- paste(ease_of_use_predictive_data$search, ease_of_use_predictive_data$oracle)

Differences in user score by search algorithm.

search_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "search", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$ease_of_use$plot

Differences in user score by oracle.

oracle_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "oracle", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$ease_of_use$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

ease_of_use_predictive_data_subset <- subset(ease_of_use_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))

alg_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data_subset, "alg", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$ease_of_use$plot

Differences in user score by participant group

participant_group_differences$ease_of_use <- user_response_diff_plot(ease_of_use_predictive_data, "participant_group", "ease.of.use", "Difference in Ease of Use Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$ease_of_use$plot

Utility: Building a Model

# Ordinal (cumulative probit) regression for the utility rating; same model
# structure and prior as the other per-metric models.
models$utility <- brm(
    formula = bf(utility ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
    # Weakly-informative pilot-derived prior on the latent cutpoints.
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    # Fit is cached on disk; delete this file to refit.
    file = "models/utility",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

summary(models$utility)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: utility ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     0.95      0.14     0.70     1.24 1.00      968     1571
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -1.77      0.36    -2.48    -1.07 1.00     1434
## Intercept[2]                -0.61      0.35    -1.30     0.10 1.00     1435
## Intercept[3]                 0.05      0.35    -0.62     0.75 1.00     1440
## Intercept[4]                 1.38      0.35     0.71     2.11 1.00     1328
## oracleDziban                 0.02      0.37    -0.68     0.75 1.00     1075
## searchDFS                   -0.83      0.37    -1.60    -0.14 1.00      947
## datasetmovies                0.22      0.27    -0.29     0.75 1.00      961
## task2.RetrieveValue         -0.16      0.18    -0.50     0.18 1.00     3620
## task3.Prediction             0.35      0.18    -0.01     0.71 1.00     3280
## task4.Exploration            0.59      0.19     0.24     0.96 1.00     3558
## participant_groupstudent     0.20      0.26    -0.30     0.72 1.00     1314
## oracleDziban:searchDFS       0.44      0.53    -0.58     1.48 1.00      979
##                          Tail_ESS
## Intercept[1]                 1586
## Intercept[2]                 1493
## Intercept[3]                 1468
## Intercept[4]                 1557
## oracleDziban                 1594
## searchDFS                    1305
## datasetmovies                1228
## task2.RetrieveValue          2528
## task3.Prediction             2193
## task4.Exploration            1858
## participant_groupstudent     1678
## oracleDziban:searchDFS       1486
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$utility)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations (which would mean the model cannot separate the
# effects of those parameters).
# FIX: brms coefficient names are case-sensitive and follow the factor
# levels ("Dziban" after the gsub recoding, "DFS" per the summary output),
# so the parameters are "b_oracleDziban" / "b_searchDFS"; the previous
# lowercase names ("b_oracledziban", "b_searchdfs") matched no parameter.
pairs(
  models$utility,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for Utility using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

# Posterior draws of the expected utility rating for each oracle/search
# combination (helper returns a list with $plot and $intervals).
utility_plot <- user_response_posterior_draws_plot(user_response_data, models$utility, NULL, "Oracle/Search Combination", "Rating")
utility_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% / 50% interval boundaries behind the plot above.
utility_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.624  0.111 1.11     0.95 mean   qi       
## 2 BFS    Dziban     0.640  0.111 1.12     0.95 mean   qi       
## 3 DFS    CompassQL -0.148 -0.694 0.375    0.95 mean   qi       
## 4 DFS    Dziban     0.288 -0.25  0.819    0.95 mean   qi       
## 5 BFS    CompassQL  0.624  0.458 0.792    0.5  mean   qi       
## 6 BFS    Dziban     0.640  0.472 0.819    0.5  mean   qi       
## 7 DFS    CompassQL -0.148 -0.333 0.0417   0.5  mean   qi       
## 8 DFS    Dziban     0.288  0.111 0.472    0.5  mean   qi
## Saving 7 x 5 in image

Utility: Differences Between Conditions

Next, we want to see if there is any significant difference in utility rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for utility, ignoring the participant-level
# random effect (re_formula = NA), plus a combined search+oracle label
# used for the condition-pair comparisons below.
utility_predictive_data <- user_response_data %>%
  add_predicted_draws(models$utility, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

# Posterior difference in utility rating between search algorithms, by task.
search_differences$utility <- user_response_diff_plot(utility_predictive_data, "search", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$utility$plot

Differences in user score by oracle.

# Posterior difference in utility rating between oracles, by task.
oracle_differences$utility <- user_response_diff_plot(utility_predictive_data, "oracle", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$utility$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Restrict to the two crossed conditions, then plot the utility-rating
# difference between DFS+CompassQL and BFS+Dziban.
utility_predictive_data_subset <- subset(utility_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$utility <- user_response_diff_plot(utility_predictive_data_subset, "alg", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$utility$plot

Differences in user score by participant group

# Posterior difference in utility rating between participant groups, by task.
participant_group_differences$utility <- user_response_diff_plot(utility_predictive_data, "participant_group", "utility", "Difference in Utility Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$utility$plot

Overall: Building a Model

# Ordinal (cumulative probit) regression for the "overall" rating.
# Same specification as the other metric models: oracle/search interaction,
# per-participant random intercept, pilot-derived cutpoint prior, and a
# cached fit under models/overall (delete the file to refit).
models$overall <- brm(
    formula = bf(overall ~ oracle * search + dataset + task + participant_group + (1 | participant_id)),
    family = cumulative("probit"),
   prior = prior(normal(0.26, 1.26), class = Intercept),
    chains = 2,
    cores = 2,
    iter = 2500,
    warmup = 1000,
    data = user_response_data,
    control = list(adapt_delta = 0.99),
    file = "models/overall",
    seed = seed
  )

Check some diagnostics regarding our model. Rhat should be close to 1 and Bulk_ESS should be in the thousands.

# Convergence diagnostics: Rhat should be ~1 and Bulk_ESS in the thousands.
summary(models$overall)
##  Family: cumulative 
##   Links: mu = probit; disc = identity 
## Formula: overall ~ oracle * search + dataset + task + participant_group + (1 | participant_id) 
##    Data: user_response_data (Number of observations: 288) 
## Samples: 2 chains, each with iter = 2500; warmup = 1000; thin = 1;
##          total post-warmup samples = 3000
## 
## Group-Level Effects: 
## ~participant_id (Number of levels: 72) 
##               Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept)     1.41      0.18     1.09     1.79 1.00     1128     1715
## 
## Population-Level Effects: 
##                          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept[1]                -2.83      0.53    -3.84    -1.76 1.00      918
## Intercept[2]                -1.46      0.49    -2.43    -0.44 1.00      869
## Intercept[3]                -0.22      0.48    -1.15     0.76 1.00      878
## Intercept[4]                 1.96      0.50     1.00     3.00 1.00      933
## oracleDziban                 0.01      0.52    -1.04     1.01 1.00      713
## searchDFS                   -0.98      0.53    -2.07     0.02 1.00      799
## datasetmovies               -0.06      0.36    -0.80     0.65 1.00      740
## task2.RetrieveValue         -0.09      0.19    -0.46     0.26 1.00     3886
## task3.Prediction             0.39      0.19     0.02     0.76 1.00     3125
## task4.Exploration            0.61      0.20     0.22     1.01 1.00     3503
## participant_groupstudent     0.48      0.36    -0.21     1.22 1.00      982
## oracleDziban:searchDFS       0.70      0.75    -0.75     2.21 1.01      672
##                          Tail_ESS
## Intercept[1]                 1519
## Intercept[2]                 1347
## Intercept[3]                 1197
## Intercept[4]                 1338
## oracleDziban                  967
## searchDFS                    1243
## datasetmovies                1009
## task2.RetrieveValue          2836
## task3.Prediction             2396
## task4.Exploration            2506
## participant_groupstudent     1286
## oracleDziban:searchDFS       1023
## 
## Family Specific Parameters: 
##      Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## disc     1.00      0.00     1.00     1.00 1.00     3000     3000
## 
## Samples were drawn using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).

Trace plots help us check whether there is evidence of non-convergence for our model.

# plot(models$overall)

In our pairs plots, we want to make sure we don’t have highly correlated parameters (highly correlated parameters means that our model has difficulty differentiating the effect of such parameters).

# Pairs plot of selected population-level coefficients to check for strong
# posterior correlations (which would mean the model cannot separate the
# effects of those parameters).
# FIX: parameter names are case-sensitive; per the summary output above the
# coefficients are "b_oracleDziban" and "b_searchDFS", so the previous
# lowercase names ("b_oracledziban", "b_searchdfs") matched no parameter.
pairs(
  models$overall,
  pars = c("b_datasetmovies",
           "b_oracleDziban",
           "b_searchDFS",
           "b_task2.RetrieveValue",
           "b_task3.Prediction",
           "b_task4.Exploration"),
  fixed = TRUE
)

We now look at a response for Overall using different combinations of search and oracle via draws from the model posterior. The thicker, shorter line represents the 95% credible interval, while the thinner, longer line represents the 50% credible interval.

# Posterior draws of the expected overall rating for each oracle/search
# combination (helper returns a list with $plot and $intervals).
overall_plot <- user_response_posterior_draws_plot(user_response_data, models$overall, NULL, "Oracle/Search Combination", "Rating")
overall_plot$plot

We can get the numeric values of the interval boundaries shown above with mean_qi

# Numeric 95% / 50% interval boundaries behind the plot above.
overall_plot$intervals
## # A tibble: 8 x 8
## # Groups:   search [2]
##   search oracle    rating  .lower .upper .width .point .interval
##   <fct>  <fct>      <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS    CompassQL  0.768  0.319   1.15    0.95 mean   qi       
## 2 BFS    Dziban     0.775  0.361   1.14    0.95 mean   qi       
## 3 DFS    CompassQL  0.182 -0.333   0.639   0.95 mean   qi       
## 4 DFS    Dziban     0.621  0.181   1.03    0.95 mean   qi       
## 5 BFS    CompassQL  0.768  0.639   0.917   0.5  mean   qi       
## 6 BFS    Dziban     0.775  0.639   0.917   0.5  mean   qi       
## 7 DFS    CompassQL  0.182  0.0139  0.361   0.5  mean   qi       
## 8 DFS    Dziban     0.621  0.486   0.778   0.5  mean   qi
## Saving 7 x 5 in image

Overall: Differences Between Conditions

Next, we want to see if there is any significant difference in overall rating between the two search algorithms (BFS and DFS) and the two oracles (Dziban and CompassQL).

# Posterior predictive draws for overall rating, ignoring the
# participant-level random effect (re_formula = NA), plus a combined
# search+oracle label used for the condition-pair comparisons below.
overall_predictive_data <- user_response_data %>%
  add_predicted_draws(models$overall, seed = seed, re_formula = NA) %>%
  mutate(alg = paste(search, oracle))

Differences in user score by search algorithm.

# Posterior difference in overall rating between search algorithms, by task.
search_differences$overall <- user_response_diff_plot(overall_predictive_data, "search", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'search', 'task' (override with `.groups` argument)
search_differences$overall$plot

Differences in user score by oracle.

# NOTE(review): this hand-rolled computation of oracle differences (and the
# halfeye plot below) is immediately overwritten by the
# user_response_diff_plot() call at the end of this chunk, so the object
# built here is dead code. Also, weighted.mean() is called with no weights,
# which is just mean(). Consider deleting this block or the helper call --
# TODO confirm which version is intended.
oracle_differences$overall <- overall_predictive_data %>% 
  group_by(oracle, .draw) %>%
   summarize(rating = weighted.mean(as.numeric(.prediction))) %>%
   compare_levels(rating, by = oracle) %>%
   rename(diff_in_rating = rating)
## `summarise()` regrouping output by 'oracle' (override with `.groups` argument)
oracle_differences$overall$metric = "overall"

# Halfeye plot of the manually computed differences (superseded below).
oracle_differences$overall %>%
      ggplot(aes(x = diff_in_rating, y = "overall")) +
      xlab(paste0("Expected Difference in Rating (",oracle_differences$overall[1,'oracle'],")")) + 
      ylab("Condition")+
      stat_halfeye(.width = c(.95, .5)) +
      geom_vline(xintercept = 0, linetype = "longdash") +
      theme_minimal()

# Recompute with the shared helper so the object's structure matches the
# other metrics (this assignment replaces the one above).
oracle_differences$overall <- user_response_diff_plot(overall_predictive_data, "oracle", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'oracle', 'task' (override with `.groups` argument)
oracle_differences$overall$plot

Differences in user score by search and oracle combination (DFS CompassQL vs BFS Dziban only)

# Restrict to the two crossed conditions, then plot the overall-rating
# difference between DFS+CompassQL and BFS+Dziban.
overall_predictive_data_subset <- subset(overall_predictive_data, alg %in% c("DFS CompassQL", "BFS Dziban"))
alg_differences$overall <- user_response_diff_plot(overall_predictive_data_subset, "alg", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'alg', 'task' (override with `.groups` argument)
alg_differences$overall$plot

Differences in user score by participant group

# Posterior difference in overall rating between participant groups, by task.
participant_group_differences$overall <- user_response_diff_plot(overall_predictive_data, "participant_group", "overall", "Difference in Overall Rating", "Task", NULL)
## `summarise()` regrouping output by 'participant_group', 'task' (override with `.groups` argument)
participant_group_differences$overall$plot

Summary Plots

Putting all of the plots for search algorithm and oracle differences together, split by whether the rating metric is of type confidence or preference. We’ll start with differences in search algorithms.

Differences in Search Algorithms

# Stack the per-metric search-algorithm difference draws into one data
# frame (same row order as listing each rbind argument by hand), then
# summarize into confidence- vs preference-metric plots and intervals.
combined_search_differences <- do.call(rbind, lapply(
  c("confidence_udata", "confidence_ans", "efficiency",
    "ease_of_use", "utility", "overall"),
  function(metric) search_differences[[metric]]$differences
))
search_difference_plots_intervals <- user_response_diff_summary(combined_search_differences, 'search')
search_difference_plots_intervals$plot_confidence

View intervals

# Numeric interval boundaries for the confidence-metric differences above.
search_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups:   search [1]
##   search    metric             difference  .lower .upper .width .point .interval
##   <chr>     <fct>                   <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS - DFS Answer                -0.0224 -0.417   0.361   0.95 mean   qi       
## 2 BFS - DFS Understanding Data     0.118  -0.278   0.528   0.95 mean   qi       
## 3 BFS - DFS Answer                -0.0224 -0.167   0.111   0.5  mean   qi       
## 4 BFS - DFS Understanding Data     0.118  -0.0278  0.25    0.5  mean   qi
# Same summary plot, for the preference metrics.
search_difference_plots_intervals$plot_preference

View intervals

# Numeric interval boundaries for the preference-metric differences above.
search_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups:   search [1]
##   search    metric      difference  .lower .upper .width .point .interval
##   <chr>     <fct>            <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS - DFS Overall          0.370 -0.139   0.889   0.95 mean   qi       
## 2 BFS - DFS Utility          0.562 -0.0833  1.22    0.95 mean   qi       
## 3 BFS - DFS Ease of Use      0.590  0.0833  1.11    0.95 mean   qi       
## 4 BFS - DFS Efficiency       0.717  0.0833  1.33    0.95 mean   qi       
## 5 BFS - DFS Overall          0.370  0.194   0.556   0.5  mean   qi       
## 6 BFS - DFS Utility          0.562  0.333   0.778   0.5  mean   qi       
## 7 BFS - DFS Ease of Use      0.590  0.417   0.778   0.5  mean   qi       
## 8 BFS - DFS Efficiency       0.717  0.5     0.944   0.5  mean   qi

Differences in Oracle

# Stack the per-metric oracle difference draws into one data frame (same
# row order as listing each rbind argument by hand), then summarize into
# confidence- vs preference-metric plots and intervals.
combined_oracle_differences <- do.call(rbind, lapply(
  c("confidence_udata", "confidence_ans", "efficiency",
    "ease_of_use", "utility", "overall"),
  function(metric) oracle_differences[[metric]]$differences
))
oracle_difference_plots_intervals <- user_response_diff_summary(combined_oracle_differences, 'oracle')
oracle_difference_plots_intervals$plot_confidence

View intervals

# Numeric interval boundaries for the confidence-metric differences above.
oracle_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups:   oracle [1]
##   oracle         metric        difference  .lower .upper .width .point .interval
##   <chr>          <fct>              <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 Dziban - Comp… Answer             0.125 -0.25    0.528   0.95 mean   qi       
## 2 Dziban - Comp… Understandin…      0.196 -0.194   0.583   0.95 mean   qi       
## 3 Dziban - Comp… Answer             0.125  0       0.25    0.5  mean   qi       
## 4 Dziban - Comp… Understandin…      0.196  0.0556  0.333   0.5  mean   qi
# Same summary plot, for the preference metrics.
oracle_difference_plots_intervals$plot_preference

View intervals

# Numeric interval boundaries for the preference-metric differences above.
oracle_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups:   oracle [1]
##   oracle            metric     difference  .lower .upper .width .point .interval
##   <chr>             <fct>           <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 Dziban - Compass… Overall        0.223  -0.278   0.75    0.95 mean   qi       
## 2 Dziban - Compass… Utility        0.226  -0.417   0.861   0.95 mean   qi       
## 3 Dziban - Compass… Ease of U…     0.0752 -0.444   0.583   0.95 mean   qi       
## 4 Dziban - Compass… Efficiency     0.238  -0.417   0.889   0.95 mean   qi       
## 5 Dziban - Compass… Overall        0.223   0.0556  0.389   0.5  mean   qi       
## 6 Dziban - Compass… Utility        0.226   0       0.444   0.5  mean   qi       
## 7 Dziban - Compass… Ease of U…     0.0752 -0.111   0.25    0.5  mean   qi       
## 8 Dziban - Compass… Efficiency     0.238   0.0278  0.472   0.5  mean   qi

DFS CompassQL vs BFS Dziban

# Stack the per-metric DFS+CompassQL vs BFS+Dziban difference draws into
# one data frame (same row order as listing each rbind argument by hand),
# then summarize into confidence- vs preference-metric plots and intervals.
combined_alg_differences <- do.call(rbind, lapply(
  c("confidence_udata", "confidence_ans", "efficiency",
    "ease_of_use", "utility", "overall"),
  function(metric) alg_differences[[metric]]$differences
))
alg_difference_plots_intervals <- user_response_diff_summary(combined_alg_differences, 'alg')
alg_difference_plots_intervals$plot_confidence

View intervals

# Numeric interval boundaries for the confidence-metric differences above.
alg_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups:   alg [1]
##   alg              metric       difference .lower .upper .width .point .interval
##   <chr>            <fct>             <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS Dziban - DF… Answer            0.103 -0.444  0.667   0.95 mean   qi       
## 2 BFS Dziban - DF… Understandi…      0.314 -0.278  0.889   0.95 mean   qi       
## 3 BFS Dziban - DF… Answer            0.103 -0.111  0.278   0.5  mean   qi       
## 4 BFS Dziban - DF… Understandi…      0.314  0.111  0.5     0.5  mean   qi
# Same summary plot, for the preference metrics.
alg_difference_plots_intervals$plot_preference

View intervals

# Numeric interval boundaries for the preference-metric differences above.
alg_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups:   alg [1]
##   alg                metric    difference  .lower .upper .width .point .interval
##   <chr>              <fct>          <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 BFS Dziban - DFS … Overall        0.593 -0.167   1.33    0.95 mean   qi       
## 2 BFS Dziban - DFS … Utility        0.788 -0.111   1.72    0.95 mean   qi       
## 3 BFS Dziban - DFS … Ease of …      0.665 -0.0556  1.44    0.95 mean   qi       
## 4 BFS Dziban - DFS … Efficien…      0.955  0.0556  1.83    0.95 mean   qi       
## 5 BFS Dziban - DFS … Overall        0.593  0.333   0.833   0.5  mean   qi       
## 6 BFS Dziban - DFS … Utility        0.788  0.5     1.11    0.5  mean   qi       
## 7 BFS Dziban - DFS … Ease of …      0.665  0.389   0.944   0.5  mean   qi       
## 8 BFS Dziban - DFS … Efficien…      0.955  0.667   1.28    0.5  mean   qi

Differences in Participant Group

# Stack the per-metric participant-group difference draws into one data
# frame (same row order as listing each rbind argument by hand), then
# summarize into confidence- vs preference-metric plots and intervals.
combined_participant_group_differences <- do.call(rbind, lapply(
  c("confidence_udata", "confidence_ans", "efficiency",
    "ease_of_use", "utility", "overall"),
  function(metric) participant_group_differences[[metric]]$differences
))
participant_group_difference_plots_intervals <- user_response_diff_summary(combined_participant_group_differences, 'participant_group')
participant_group_difference_plots_intervals$plot_confidence

View intervals

# Numeric interval boundaries for the confidence-metric differences above.
participant_group_difference_plots_intervals$intervals_confidence
## # A tibble: 4 x 8
## # Groups:   participant_group [1]
##   participant_group  metric    difference  .lower .upper .width .point .interval
##   <chr>              <fct>          <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - profess… Answer        0.0752 -0.319   0.481   0.95 mean   qi       
## 2 student - profess… Understa…     0.0878 -0.294   0.469   0.95 mean   qi       
## 3 student - profess… Answer        0.0752 -0.0563  0.206   0.5  mean   qi       
## 4 student - profess… Understa…     0.0878 -0.0438  0.219   0.5  mean   qi
# Same summary plot, for the preference metrics.
participant_group_difference_plots_intervals$plot_preference

View intervals

# Numeric interval boundaries for the preference-metric differences above.
participant_group_difference_plots_intervals$intervals_preference
## # A tibble: 8 x 8
## # Groups:   participant_group [1]
##   participant_group   metric   difference  .lower .upper .width .point .interval
##   <chr>               <fct>         <dbl>   <dbl>  <dbl>  <dbl> <chr>  <chr>    
## 1 student - professi… Overall       0.278 -0.231   0.825   0.95 mean   qi       
## 2 student - professi… Utility       0.183 -0.456   0.831   0.95 mean   qi       
## 3 student - professi… Ease of…      0.318 -0.2     0.837   0.95 mean   qi       
## 4 student - professi… Efficie…      0.133 -0.513   0.794   0.95 mean   qi       
## 5 student - professi… Overall       0.278  0.1     0.45    0.5  mean   qi       
## 6 student - professi… Utility       0.183 -0.0375  0.400   0.5  mean   qi       
## 7 student - professi… Ease of…      0.318  0.144   0.494   0.5  mean   qi       
## 8 student - professi… Efficie…      0.133 -0.0938  0.356   0.5  mean   qi

Histograms for Response Distributions

Here we plot out the number of responses for each rating (-2 to 2 inclusive) across all of our user metrics (Confidence in Understanding Data, Confidence in Answer, Efficiency, Ease of Use, Utility, and Overall). Because each user completed 4 tasks, the total number of responses in these graphs is four times the total number of users in our study.

# Capitalize the dataset names so the facet strip labels read nicely.
for (swap in list(c("birdstrikes", "Birdstrikes"), c("movies", "Movies"))) {
  user_response_data$dataset <- gsub(swap[[1]], swap[[2]], user_response_data$dataset)
}

# All six rating metrics share the same histogram layout, so build them
# with a single helper instead of six copy-pasted chunks.
#
# col:   name of the rating column in user_response_data (string)
# label: x-axis label for the plot
# Returns a ggplot object: bar chart of response counts per rating,
# faceted by dataset (rows) and search/oracle combination (columns).
plot_rating_histogram <- function(col, label) {
  user_response_data %>%
    ggplot(aes(x = .data[[col]])) +
    geom_bar() +
    xlab(label) +
    ylab("Number of Responses") +
    facet_grid(dataset ~ search + oracle)
}

plot_rating_histogram("confidence.udata", "Confidence in Understanding Data Rating")

plot_rating_histogram("confidence.ans", "Confidence in Answer Rating")

plot_rating_histogram("efficiency", "Efficiency Rating")

plot_rating_histogram("ease.of.use", "Ease of Use Rating")

plot_rating_histogram("utility", "Utility Rating")

plot_rating_histogram("overall", "Overall Rating")